3. Pathway Enrichment Analysis¶

  • Extracting Biological Insights from top important features of subtypes
In [1]:
from IPython.display import display, Image
Image("../img/cellular_molecular_processes.png", width=800) 
Out[1]:
No description has been provided for this image
In [2]:
import requests 
import json
import pandas as pd 

def identifiers(ids='EGF,EGFR', interactors=False, page_size='1', page='1', species='Homo Sapiens',
                sort_by='ENTITIES_FDR', order='ASC', resource='TOTAL', p_value='1', include_disease=True,
                min_entities=None, max_entities=None, projection=False, importable_only=False):
    """
    Given a list of protein, gene, or small molecule identifiers conducts reactome pathway enrichment analysis.

    :param ids: comma seperated list of proteins, genes or small molecules identifiers symbol in string format ex. 'EGF,EGFR'
    :param interactors: boolean value indicating include interations
    :param page_size: page size
    :param page: number of pages
    :param species: list of species to filter the result (accepts taxonomy ids, species names and dbId)
    :param sort_by: how to sort the result. Available filters: TOTAL_ENTITIES, TOTAL_REACTIONS, TOTAL_INTERACTIONS,
        FOUND_ENTITIES, FOUND_INTERACTIONS, FOUND_REACTIONS, ENTITIES_RATIO, ENTITIES_PVALUE, ENTITIES_FDR, REACTIONS_RATIO
    :param order: order ASC or DESC
    :param resource: the resource to sort TOTAL, UNIPORT, ENSEMBLE, CHEMBI, IUPHAR, MIRBASE, NCBI_PROTEIN, EMBL, COMPOUND, PUBCEM_COMPOUND
    :param p_value: defines the pValue threshold. Only hit pathway with pValue equals or below the threshold will be returned
    :param include_disease: set to ‘false’ to exclude the disease pathways from the result (it does not alter the statistics)
    :param projection: if true, projects the identifiers to human and only shows the result in this species
    :param max_entities: maximum number of contained entities per pathway (takes into account the resource)
    :param min_entities: minimum number of contained entities per pathway (takes into account the resource)
    :return: Json dictionary object
    """
    if isinstance(page_size, NumberTypes):
        page_size = str(page_size)

    if isinstance(page, NumberTypes):
        page = str(page)

    if isinstance(p_value, NumberTypes):
        p_value = str(p_value)

    if isinstance(min_entities, NumberTypes):
        min_entities = str(min_entities)

    if isinstance(max_entities, NumberTypes):
        max_entities = str(max_entities)

    if interactors:
        interactors = 'true'
    else:
        interactors = 'false'

    if include_disease:
        include_disease = 'true'
    else:
        include_disease = 'false'

    headers = {
        'accept': 'application/json',
        'content-type': 'text/plain',
    }

    params = (
        ('interactors', interactors),
        ('pageSize', page_size),
        ('page', page),
        ('sortBy', sort_by),
        ('order', order),
        ('species',  species),
        ('resource', resource),
        ('pValue', p_value),
        ('includeDisease', include_disease),
        ('min', min_entities),
        ('max', max_entities),
        ('importableOnly', importable_only)
    )

    if projection:
        url = 'https://reactome.org/AnalysisService/identifiers/projection'
    else:
        url = 'https://reactome.org/AnalysisService/identifiers/'

    data = ids

    try:
        response = requests.post(url=url, headers=headers, params=params, data=data)
    except ConnectionError as e:
        print(e)

    if response.status_code == 200:
        return response.json()
    else:
        print('Status code returned a value of %s' % response.status_code)

def token(token, species='Homo sapiens', page_size='1', page='1', sort_by='ENTITIES_FDR', order='ASC', resource='TOTAL',
          p_value='1', include_disease=True, min_entities=None, max_entities=None):
    """
    Returns the result associated with token.
    Use page and pageSize to reduce the amount of data retrieved. Use sortBy and order to sort the result by your preferred option.
    The resource field will filter the results to show only those corresponding to the preferred molecule type (TOTAL includes all the different molecules type)

    :param token: The token associated with the data result - analysis Web-Service is token based, so for every analysis
        request a TOKEN is associated to the result
    :param species: List of species to filter the result (accepts taxonomy ids, species names and reactome dbId)
    :param page_size: Page size
    :param page: Page number
    :param sort_by: How to sort the result. Available filters: TOTAL_ENTITIES, TOTAL_REACTIONS, TOTAL_INTERACTIONS,
        FOUND_ENTITIES, FOUND_INTERACTIONS, FOUND_REACTIONS, ENTITIES_RATIO, ENTITIES_PVALUE, ENTITIES_FDR, REACTIONS_RATIO
    :param order: Order ASC or DESC
    :param resource: The resource to sort TOTAL, UNIPORT, ENSEMBLE, CHEMBI, IUPHAR, MIRBASE, NCBI_PROTEIN, EMBL, COMPOUND, PUBCEM_COMPOUND
    :param p_value: Defines the pValue threshold. Only hit pathway with pValue equals or below the threshold will be returned
    :param include_disease: Set to ‘false’ to exclude the disease pathways from the result (it does not alter the statistics)
    :param min_entities: Minimum number of contained entities per pathway (takes into account the resource)
    :param max_entities: Maximum number of contained entities per pathway (takes into account the resource)
    :return: Json dictionary object
    """
    if isinstance(page_size, NumberTypes):
        page_size = str(page_size)

    if isinstance(page, NumberTypes):
        page = str(page)

    if isinstance(p_value, NumberTypes):
        p_value = str(p_value)

    if isinstance(min_entities, NumberTypes):
        min_entities = str(min_entities)

    if isinstance(max_entities, NumberTypes):
        max_entities = str(max_entities)

    if include_disease:
        include_disease = 'true'
    else:
        include_disease = 'false'

    headers = {
        'accept': 'application/json',
    }

    params = (
        ('pageSize', page_size),
        ('page', page),
        ('sortBy', sort_by),
        ('order', order),
        ('species',  species),
        ('resource', resource),
        ('pValue', p_value),
        ('includeDisease', include_disease),
        ('min', min_entities),
        ('max', max_entities),
    )

    url = 'https://reactome.org/AnalysisService/token/%s' % token

    try:
        response = requests.get(url=url, headers=headers, params=params)
    except ConnectionError as e:
        print(e)

    if response.status_code == 200:
        return response.json()
    else:
        print('Status code returned a value of %s' % response.status_code)


def report(token, path, file='report.pdf', number='25', resource='TOTAL', diagram_profile='Modern', analysis_profile='Standard',
                fireworks_profile='Barium Lithium', species='Homo sapiens', chunk_size=128):
    """
    Downloads a report for a given pathway analysis result

    :param token: The token associated with the data result - analysis Web-Service is token based, so for every analysis
        request a TOKEN is associated to the result
    :param path: Absolute path to save the report pdf file to
    :param file: Pdf file name to save the analysis report to - default set to report.pdf
    :param number: Number of pathways reported (max 50)
    :param resource: The resource to sort TOTAL, UNIPORT, ENSEMBLE, CHEMBI, IUPHAR, MIRBASE, NCBI_PROTEIN, EMBL, COMPOUND, PUBCEM_COMPOUND
    :param diagram_profile: Diagram Color Profile - as string
    :param analysis_profile: Analysis Color Profile - as string
    :param fireworks_profile: Diagram Color Profile - as string
    :param species: The species for which results will be reported
    :param chunk_size: Python generator iter_content() chunk size - default set to 128
    :return: Saves a reactome analysis pdf report to the indicated path and file name
    """
    if isinstance(number, NumberTypes):
        number = str(number)

    headers = {
        'accept': 'application/pdf',
    }

    params = (
        ('number', number),
        ('resource', resource),
        ('diagramProfile', diagram_profile),
        ('analysisProfile', analysis_profile),
        ('fireworksProfile', fireworks_profile),
    )

    try:
        response = requests.get('https://reactome.org/AnalysisService/report/%s/%s/%s' % (token, species, file),
                                headers=headers, params=params)
    except ConnectionError as e:
        print(e)

    if response.status_code == 200:
        with open("".join([path, file]), 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    else:
        print('Status code returned a value of %s' % response.status_code)

NumberTypes = (int, float, complex)
In [3]:
input_filename = "../data/selected_features_by_subtype.json"
with open(input_filename, "r") as f:
    selected_features = json.load(f)

subtype_of_interest = 'Basal'
_ids = ",".join(selected_features[subtype_of_interest]) # comma seperated list of gene features in string... 

result = identifiers(ids=_ids, interactors=False, page_size='1', page='1', species='Homo Sapiens',
                sort_by='ENTITIES_FDR', order='ASC', resource='TOTAL', p_value='0.05', include_disease=True,
                min_entities=None, max_entities=None, projection=False)

_token = result['summary']['token']

token_result = token(_token, species='Homo sapiens', page_size='-1', page='-1', sort_by='ENTITIES_FDR',
                                      order='ASC', resource='TOTAL', p_value='0.05', include_disease=False,
                                      min_entities=None, max_entities=None)

enrichment_analysis = [p for p in token_result['pathways']]
_names = [(e['name'], e['entities']['pValue'], e['entities']['total'], e['entities']['found']) for e in enrichment_analysis]
df = pd.DataFrame(_names, columns=['Pathway name', 'pValue', 'total', 'found'])
df = df.sort_values(by='pValue', ascending=True)
print("\nTCGA-BRCA Basal subtype features pathway enrichment analysis: \n\n", df)

x = Image("../img/PathwaysOverview.png") 
y = Image("../img/Reacfoam.jpg") 
display(x, y) 
TCGA-BRCA Basal subtype features pathway enrichment analysis: 

                                          Pathway name    pValue  total  found
0             APC-Cdc20 mediated degradation of Nek2A  0.000342     26      4
1   Inactivation of APC/C via direct inhibition of...  0.002409     21      3
2   Inhibition of the proteolytic activity of APC/...  0.002409     21      3
3   APC:Cdc20 mediated degradation of cell cycle p...  0.007419     61      4
4   APC/C:Cdc20 mediated degradation of mitotic pr...  0.008282     63      4
5   Activation of APC/C and APC/C:Cdc20 mediated d...  0.008737     64      4
6    TP53 Regulates Transcription of Cell Cycle Genes  0.009208     65      4
7                   NEIL3-mediated resolution of ICLs  0.012352      1      1
8    RUNX2 regulates genes involved in cell migration  0.013438     14      2
10  APC/C-mediated degradation of cell cycle proteins  0.017597     79      4
9                    Regulation of mitotic cell cycle  0.017597     79      4
11                       Phosphorylation of the APC/C  0.026129     20      2
12  TP53 Regulates Transcription of Genes Involved...  0.026129     20      2
13  TP53 Regulates Transcription of Genes Involved...  0.028577     21      2
14  Conversion from APC/C:Cdc20 to APC/C:Cdh1 in l...  0.031113     22      2
15       APC/C:Cdc20 mediated degradation of Cyclin B  0.036438     24      2
17                   Chondroitin sulfate biosynthesis  0.039223     25      2
16  Cdc20:Phospho-APC/C mediated degradation of Cy...  0.039623     60      3
18  Regulation of MITF-M-dependent genes involved ...  0.048037     28      2
No description has been provided for this image
No description has been provided for this image